####################################################  
#Author: Kelli Marquardt
#Purpose: Prepare the DSM text and the patient note text for NLP algorithm match 

# Inputs:
#- data/DSM_sx_combined.csv
#- data/intermediate/cleaned_appt_notes_fake.csv

# Outputs:
#- data/intermediate/DSM_Vectors.csv
#- data/intermediate/pat_notes_fake_Vectors.csv

####################################################

############################
#0 load required packages
############################
rm(list = ls(all.names = TRUE))

#load packages 
library(dplyr) 
library(textstem) #for stemming
library(parallel)
library(parallelMap)
library(pbapply) #for apply function progress bar 
library(stringr) #for str_detect
library(data.table) #fread and fwrite
library(dplyr)
library(tidytext) #for unnest tokens
library(tm) #for less restrictive set of stop words 
library(textclean) #for replace_contraction
library(NLP) #for POS 

# Verify that openNLP can be loaded
if (!requireNamespace("openNLP", quietly = TRUE)) {
  stop(
    paste(
      "Package 'openNLP' could not be loaded.",
      "See README: section 3.2 for 'Java / openNLP troubleshooting'.",
      sep = "\n"
    )
  )
}

library(openNLP) #for POS

##############
# Note: wordnet is required for synonym replacement. The WordNet dictionary must be downloaded first; see the instructions in README and Step0_InstallPackages.R.
# Replace the WordNet dictionary location passed to setDict() below with the path on your machine.
library(wordnet) #for synonym word replacement
setDict("C:/Program Files (x86)/WordNet/2.1/dict")


############################
#0 Make sure the GloVe word embeddings are downloaded (see instructions in README and Step0_InstallPackages.R)
############################
# Close-word stems are based on word-embedding distance from GloVe
# (Global Vectors for Word Representation).
glove_dir <- "../data/Glove"
glove_file <- file.path(glove_dir, "glove.6B.300d.txt")

if (!file.exists(glove_file)) {
  stop("Make sure GloVe embeddings are saved in data/Glove")
}

###
# Set up the embedding lookup (this chunk can take a few minutes to run and
# needs a lot of RAM). Every line of the file is split on single spaces: the
# first field is used as the key, the remaining fields are converted to numeric.
glove_lines <- readLines(glove_file)
embeddings_index <- new.env(hash = TRUE, parent = emptyenv())
gc()

for (glove_line in glove_lines) {
  fields <- strsplit(glove_line, " ", fixed = TRUE)[[1]]
  embeddings_index[[fields[1]]] <- as.numeric(fields[-1])
}
rm(glove_lines, glove_line, fields)
gc()

# Collect the stored vectors into embedding_mat, using the keys as column names.
# NOTE(review): ls() omits names that begin with "." unless all.names = TRUE,
# so such tokens are not carried into embedding_mat -- confirm this is intended.
vocab <- ls(embeddings_index)
embedding_mat <- pbsapply(vocab, function(token) embeddings_index[[token]])
colnames(embedding_mat) <- vocab
rm(vocab, embeddings_index)



#################################################
#0: define functions used in code  
#################################################

# Normalise raw text before tokenisation.
#   x: character vector of text.
# Steps, in order: expand contractions, add a space after commas, lowercase,
# and squish whitespace with stringr::str_squish().
# Returns the cleaned character vector.
# Note: a gsub() step specific to the true de-identified data is left
# commented out in this replication code.
clean_notes_specific <- function(x) {
  expanded <- textclean::replace_contraction(x)
  spaced <- textclean::add_comma_space(expanded)
  lowered <- tolower(spaced)
  #lowered <- gsub("[^a-zA-Z0-9 ]|[0-9]|xx+", "  ", lowered)
  stringr::str_squish(lowered)
}


###
# Make cosine similarity efficient by using one matrix product instead of a loop.
# The transposed embedding matrix and the Euclidean norm of each of its rows do
# not depend on the query vector, so they are computed once here, outside of
# cosine_sim().
t_embedding_mat <- t(embedding_mat)
sqrt_embedding_sum_sq <- sqrt(rowSums(t_embedding_mat^2))
# Rank every row of t_embedding_mat by cosine similarity to `vec`.
#   vec: numeric vector (or one-column matrix) with one entry per column of
#        t_embedding_mat.
# Returns the row indices of the (up to) 10 most similar rows, best first.
# Reads the precomputed globals t_embedding_mat and sqrt_embedding_sum_sq.
cosine_sim <- function(vec) {
  sim <- (t_embedding_mat %*% vec) / (sqrt_embedding_sum_sq * sqrt(sum(vec^2)))
  # head() instead of [1:10]: with fewer than 10 rows, [1:10] would pad the
  # result with NA indices.
  head(order(sim, decreasing = TRUE), 10)
}

# Apply textstem::stem_words() repeatedly to a character vector.
#   word:   character vector of words to stem.
#   n_stem: number of stemming passes (the pipeline uses n_stem = 3, which the
#           original author notes gives the shortest stem).
# Returns `word` unchanged when n_stem is below 1; otherwise the result of
# floor(n_stem) successive stemming passes.
stem_fn <- function(word, n_stem) {
  stemmed <- word
  for (pass in seq_len(max(0, floor(n_stem)))) {
    stemmed <- textstem::stem_words(stemmed)
  }
  stemmed
}

# Return the closest stem for a given word.
#   word:   a single word (character scalar).
#   n_stem: number of stemming passes handed to stem_fn().
# If `word` is not a column of embedding_mat, its own stem is returned.
# Otherwise the nearest neighbours from cosine_sim() are stemmed, neighbours
# whose stem equals the word's own stem are dropped, and the best remaining
# stem is returned. If nothing remains, the word's own stem is returned rather
# than NA.
getNearestWords2 <- function(word, n_stem) {
  word_stem <- stem_fn(word, n_stem)

  # word is not in the embeddings matrix: fall back to its stem
  if (!(word %in% colnames(embedding_mat))) {
    return(word_stem)
  }

  # word is in the embeddings matrix: rank all words by cosine similarity
  word_vec <- embedding_mat[, word, drop = FALSE]  # keep as column matrix
  neighbours <- colnames(embedding_mat)[cosine_sim(word_vec)]

  # stem the neighbours and drop the original word (or anything sharing its stem)
  neighbour_stems <- stem_fn(neighbours, n_stem)
  candidates <- neighbour_stems[!is.na(neighbour_stems) &
                                  neighbour_stems != word_stem]

  # every neighbour collapsed onto the word's own stem: use that stem
  if (length(candidates) == 0) {
    return(word_stem)
  }

  # top ranked closest word stem
  candidates[1]
}


# Part-of-speech tag a piece of text.
#   note: text passed to annotate().
# Annotates sentences and words first, then adds POS tags on top of those
# annotations, keeps the annotations of type "word", and returns the "POS"
# entry of each one's features.
pos_tagging <- function(note) {
  tokenizers <- list(Maxent_Sent_Token_Annotator(),
                     Maxent_Word_Token_Annotator())
  token_spans <- annotate(note, tokenizers)
  tagged <- annotate(note, Maxent_POS_Tag_Annotator(), token_spans)
  word_rows <- subset(tagged, type == "word")
  sapply(word_rows$features, `[[`, "POS")
}



#################################################
#Step 1: read in the DSM symptom list text
#################################################
# Raw symptom table; later steps use its `type` and `symptom_text` columns.
symptom_orig <- read.csv("../data/DSM_sx_combined.csv")

# Cleaned copy: symptom_text run through clean_notes_specific().
symptom_clean <- mutate(
  symptom_orig,
  symptom_text = clean_notes_specific(symptom_text)
)


#################################################
#Step 2: Get word to synonym mapping
#################################################

# First, do POS tagging based on the full symptom text.
# Tokenise the cleaned text into one row per token, keeping the full text
# column (drop=F) and keeping punctuation tokens (strip_punct=F).
symptom_pos_tag=symptom_clean
symptom_pos_tag=symptom_pos_tag %>%
  unnest_tokens(word, symptom_text, to_lower=T,drop=F, strip_punct=F)
# Placeholder value; filled with the POS tags in the loop below.
symptom_pos_tag$tag=0

# Apply the POS tag function to both symptom lists (type 1 and type 2).
# NOTE(review): tags are computed from the ORIGINAL text (symptom_orig) while
# the token rows come from the CLEANED text (symptom_clean). The assignment
# below is positional, so both tokenisations presumably need to yield the same
# number of tokens per type, in the same order -- confirm against the data.
for (i in 1:2){
  tags=pos_tagging(symptom_orig[which(symptom_orig$type==i),"symptom_text"])
  symptom_pos_tag[which(symptom_pos_tag$type==i),"tag"]=tags
}

###
# Keep only words that are among the most common adjective, noun, verb, or
# adverb tags. Also remove stop words, net of negation words.

# most common adjective, noun, verb, and adverb tags
adjective_list <- c("JJ", "JJR", "JJS")
noun_list <- c("NN", "NNS")
verb_list <- c("VB", "VBD", "VBG", "VBN", "VBP", "VBZ")
adverb_list <- c("RB", "RBR", "RBS")

# negate_words: negation terms that must NOT be treated as stop words.
# The list includes those from qdapDictionaries::negation.words plus others
# relevant to symptom and note text.
negate_words <- data.frame(
  word = c(
    "no", "not", "rather", "wont", "never", "none", "nobody", "nothing",
    "neither", "nor", "non", "nowhere", "cannot", "without", "hardly",
    "less", "little", "rarely", "scarcely", "seldom", "barely", "denied",
    "denies", "deny", "fails", "fail", "failed", "difficulty", "difficult",
    "difficulties"
  )
)

# full stop-word list: tm stop words with contractions expanded, lowercased,
# split into single tokens, and de-duplicated
stop_words_full <- data.frame(
  word = tolower(replace_contraction(tm::stopwords(), sent.cap = FALSE))
) %>%
  unnest_tokens(word, word, to_lower = TRUE, drop = TRUE, strip_punct = TRUE) %>%
  distinct()

# stop_words_sub: all stop words net of negate_words
stop_words_sub <- anti_join(stop_words_full, negate_words, by = "word")
rm(stop_words_full)



# Assign each token a coarse POS label based on its tag.
# Tokens whose tag is in none of the four lists keep the placeholder 0, which
# the synonym step later filters on (POS != 0).
# Vectorised per label instead of looping over 1:nrow(): the row loop fails on
# a zero-row table (1:0 iterates over 1 and 0) and is slow on long tables.
symptom_pos_tag$POS <- 0
pos_tag_sets <- list(
  ADJECTIVE = adjective_list,
  NOUN = noun_list,
  VERB = verb_list,
  ADVERB = adverb_list
)
for (pos_label in names(pos_tag_sets)) {
  pos_hits <- symptom_pos_tag$tag %in% pos_tag_sets[[pos_label]]
  # only assign when something matched, so an untouched column keeps its
  # original numeric placeholder type
  if (any(pos_hits)) {
    symptom_pos_tag$POS[pos_hits] <- pos_label
  }
}
rm(pos_tag_sets, pos_label, pos_hits)

# Drop stop words (net of negation words), then strip every non-alphabetic
# character from the remaining tokens and discard tokens that end up empty
# (this removes punctuation-only tokens).
symptom_pos_tag <- symptom_pos_tag %>%
  filter(!(word %in% stop_words_sub$word)) %>%
  mutate(word = str_replace_all(word, "[^[:alpha:]]", "")) %>%
  filter(word != "")


### Look up a synonym for every token labelled NOUN, VERB, ADJECTIVE, or ADVERB

symptom_syn <- symptom_pos_tag %>%
  filter(POS != 0) %>%
  select(c(type, word, POS)) %>%
  distinct()

# Take the first synonym that is a single word:
#   - look the word itself up first;
#   - if that gives no single-word synonym, look its lemma up;
#   - if neither does, keep the word itself.
symptom_syn$lemma <- lemmatize_words(symptom_syn$word) # a lemma for each word
symptom_syn$synonym <- 0
# seq_len() so that an empty table is skipped instead of iterating over 1:0
for (i in seq_len(nrow(symptom_syn))) {
  # default when no single-word synonym is found for word or lemma
  symptom_syn$synonym[i] <- symptom_syn$word[i]
  for (syn_term in c(symptom_syn$word[i], symptom_syn$lemma[i])) {
    syn <- textclean::replace_contraction(synonyms(syn_term, symptom_syn$POS[i]))
    syn_1word <- which(str_count(syn, "\\S+") == 1)
    if (length(syn_1word) > 0) {
      symptom_syn$synonym[i] <- syn[syn_1word[1]]
      break
    }
  }
}

# Make sure each type-word pair maps to exactly one synonym.
# num_syn counts the distinct synonyms found for each type-word pair.
single_map_check <- symptom_syn %>%
  select(type, word, synonym) %>%
  distinct() %>%
  group_by(type, word) %>%
  mutate(num_syn = n_distinct(synonym)) %>%
  ungroup()
# any() states the intent directly. The previous max(num_syn>1) relied on
# logical-to-number coercion, and on an empty table max() returns -Inf, which
# is treated as TRUE and would stop the script with nothing wrong.
if (any(single_map_check$num_syn > 1)) {
  stop("Ensure only 1 synonym matches to a given type-word pair. If not, replace with word. ")
}

# Stem every synonym (3 passes of stem_fn).
symptom_syn <- mutate(symptom_syn, synonym_stem = stem_fn(synonym, n_stem = 3))


# Clean up the intermediates; symptom_syn is kept for the later mapping.
rm(
  single_map_check,
  symptom_pos_tag, adjective_list, adverb_list, noun_list, verb_list,
  i, syn, tags, syn_1word
)



#################################################
#Step 3: Get word to "closest" word mapping
#################################################

# Tokenise the cleaned symptom text (ok to remove punctuation here) and drop
# stop words, net of negation words.
symptom_filter_tokens <- symptom_clean %>%
  unnest_tokens(word, symptom_text, to_lower = TRUE, drop = TRUE, strip_punct = TRUE) %>%
  anti_join(stop_words_sub, by = "word")

#########
# For each word, get its stem and its closest stem
########

# unique type-word pairs, each with the word's own stem (3 passes)
symptom_close_stem <- symptom_filter_tokens %>%
  distinct(type, word) %>%
  mutate(stem = stem_fn(word, n_stem = 3))

# The nearest-word search is run once per distinct word rather than once per
# type-word pair, to reduce the number of lookups.
unique_words <- symptom_close_stem %>%
  distinct(word) %>%
  mutate(close_stem = pbsapply(word, getNearestWords2, n_stem = 3))

# merge the closest stems back onto the type-word pairs
symptom_close_stem <- left_join(symptom_close_stem, unique_words, by = "word")

# clean up; symptom_close_stem is kept for the later mapping
rm(unique_words, symptom_filter_tokens)


#################################################
#Step 4: Merge symptom_syn and symptom_close_stem into a DSM type-word to stem, close_stem, synonym_stem mapping
#################################################

# Full join so that type-word pairs present in only one of the two tables are
# kept.
dsm_word_map <- full_join(
  select(symptom_syn, type, word, synonym_stem),
  symptom_close_stem,
  by = c("type", "word")
)

# Any stem column left missing by the join is replaced with stem_fn(word, 3).
# The replacement value is the same for all three columns, so it is computed
# once in a temporary column that is dropped again afterwards.
dsm_word_map <- dsm_word_map %>%
  mutate(
    fallback_stem = stem_fn(word, n_stem = 3),
    stem = if_else(is.na(stem), fallback_stem, stem),
    close_stem = if_else(is.na(close_stem), fallback_stem, close_stem),
    synonym_stem = if_else(is.na(synonym_stem), fallback_stem, synonym_stem)
  ) %>%
  select(-fallback_stem)


### save the DSM word mapping
write.csv(
  dsm_word_map,
  "../data/intermediate/DSM_Vectors.csv",
  row.names = FALSE
)




#################################################
#Step 5: process the patient note text as well
  #only need to clean, tokenize, remove stop words net negation, and stem function
#################################################
# Clear the global environment except for the objects this step still needs.
keep_objects <- c("stop_words_sub", "clean_notes_specific", "stem_fn")
rm(
  list = setdiff(ls(envir = .GlobalEnv), keep_objects),
  envir = .GlobalEnv
)

# read in the cleaned version of the note data and keep only ids and notes
notes_dta_cleaned <- fread(
  "../data/intermediate/cleaned_appt_notes_fake.csv",
  stringsAsFactors = FALSE
) %>%
  select(pat_id, visit_id, notes)

# tokenize, then remove stop words net of negation words
notes_dta_cleaned_tokens <- notes_dta_cleaned %>%
  unnest_tokens(word, notes, to_lower = TRUE, drop = TRUE, strip_punct = TRUE) %>%
  anti_join(stop_words_sub, by = "word")

# stem 3 times
notes_dta_cleaned_tokens$stem <- stem_fn(notes_dta_cleaned_tokens$word, n_stem = 3)

# save notes vector
write.csv(
  notes_dta_cleaned_tokens,
  "../data/intermediate/pat_notes_fake_Vectors.csv",
  row.names = FALSE
)

#END OF SCRIPT
